MyData <- read.csv('/Users/jesusleon/Downloads/london_merged.csv', row.names=1)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
time= as_datetime(rownames(MyData[0]))
MyData<-data.frame(time, MyData)

#stationarity

library(tseries)
adf.test(MyData$cnt) # p-value < 0.05 indicates the TS is stationary
## Warning in adf.test(MyData$cnt): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  MyData$cnt
## Dickey-Fuller = -9.9098, Lag order = 25, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(MyData$cnt)
## Warning in kpss.test(MyData$cnt): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  MyData$cnt
## KPSS Level = 2.8862, Truncation lag parameter = 14, p-value = 0.01

So we can conclude that the time series is stationanry

###Regression model with trend, seasonality, and white noise to make future predictions

library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('lubridate')
library('forecast')

#creating a time series with xts library and making a zoo object

library('TSstudio')
library('zoo')
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library('xts')
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last

#linear regression forecasting

#creating series trend
MyData$trend <- 1:nrow(MyData)
#creating seasonality based on hours 
MyData$seasonal <- factor(hour(MyData$time), ordered = FALSE)

setting a testing partition length

 h <- length(MyData$time)/8 # setting a testing partition length
   train <- MyData[1:(nrow(MyData) - h), ]
   test <- MyData[(nrow(MyData) - h + 1):nrow(MyData), ]

#TREND

md_trend <- lm(cnt ~ trend, data = train)
summary(md_trend)
## 
## Call:
## lm(formula = cnt ~ trend, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1295.5  -883.6  -297.3   535.6  6767.8 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.940e+02  1.772e+01   56.09   <2e-16 ***
## trend       2.195e-02  2.014e-03   10.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1094 on 15235 degrees of freedom
## Multiple R-squared:  0.007737,   Adjusted R-squared:  0.007672 
## F-statistic: 118.8 on 1 and 15235 DF,  p-value: < 2.2e-16

#SEASONALITY

md_seasonality <- lm(cnt ~ seasonal, data = train)
summary(md_seasonality)
## 
## Call:
## lm(formula = cnt ~ seasonal, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2829.8  -245.4   -21.3   246.6  4969.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   296.63      27.24  10.890  < 2e-16 ***
## seasonal1     -95.06      38.52  -2.468   0.0136 *  
## seasonal2    -158.37      38.57  -4.106 4.04e-05 ***
## seasonal3    -200.67      38.57  -5.203 1.99e-07 ***
## seasonal4    -222.29      38.57  -5.764 8.39e-09 ***
## seasonal5    -184.94      38.58  -4.793 1.66e-06 ***
## seasonal6     175.01      38.51   4.545 5.53e-06 ***
## seasonal7    1183.42      38.51  30.733  < 2e-16 ***
## seasonal8    2598.20      38.54  67.421  < 2e-16 ***
## seasonal9    1358.63      38.49  35.297  < 2e-16 ***
## seasonal10    774.31      38.52  20.101  < 2e-16 ***
## seasonal11    862.79      38.48  22.424  < 2e-16 ***
## seasonal12   1151.20      38.46  29.931  < 2e-16 ***
## seasonal13   1229.11      38.48  31.945  < 2e-16 ***
## seasonal14   1197.62      38.48  31.126  < 2e-16 ***
## seasonal15   1296.46      38.46  33.708  < 2e-16 ***
## seasonal16   1614.87      38.45  42.003  < 2e-16 ***
## seasonal17   2593.50      38.48  67.406  < 2e-16 ***
## seasonal18   2401.95      38.48  62.427  < 2e-16 ***
## seasonal19   1407.11      38.49  36.557  < 2e-16 ***
## seasonal20    795.80      38.49  20.675  < 2e-16 ***
## seasonal21    463.30      38.51  12.032  < 2e-16 ***
## seasonal22    306.15      38.52   7.948 2.04e-15 ***
## seasonal23    151.19      38.55   3.922 8.83e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 685.9 on 15213 degrees of freedom
## Multiple R-squared:  0.6104, Adjusted R-squared:  0.6098 
## F-statistic:  1036 on 23 and 15213 DF,  p-value: < 2.2e-16
length(md_seasonality$residuals)
## [1] 15237
md_trend.2 <- lm(cnt ~ trend + seasonal + t1 + t2+ hum + wind_speed + factor(weather_code) + factor(season)+ I(is_weekend) +I(is_holiday), data = train)
summary(md_trend.2)
## 
## Call:
## lm(formula = cnt ~ trend + seasonal + t1 + t2 + hum + wind_speed + 
##     factor(weather_code) + factor(season) + I(is_weekend) + I(is_holiday), 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2957.2  -283.9   -24.9   268.5  4258.5 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             9.958e+02  5.437e+01  18.316  < 2e-16 ***
## trend                   2.066e-03  1.155e-03   1.789 0.073578 .  
## seasonal1              -6.979e+01  3.243e+01  -2.152 0.031418 *  
## seasonal2              -1.141e+02  3.249e+01  -3.512 0.000446 ***
## seasonal3              -1.407e+02  3.250e+01  -4.328 1.51e-05 ***
## seasonal4              -1.437e+02  3.252e+01  -4.417 1.01e-05 ***
## seasonal5              -9.843e+01  3.255e+01  -3.024 0.002502 ** 
## seasonal6               2.414e+02  3.254e+01   7.418 1.26e-13 ***
## seasonal7               1.207e+03  3.253e+01  37.093  < 2e-16 ***
## seasonal8               2.559e+03  3.258e+01  78.547  < 2e-16 ***
## seasonal9               1.251e+03  3.271e+01  38.245  < 2e-16 ***
## seasonal10              5.895e+02  3.302e+01  17.852  < 2e-16 ***
## seasonal11              6.157e+02  3.333e+01  18.472  < 2e-16 ***
## seasonal12              8.597e+02  3.358e+01  25.598  < 2e-16 ***
## seasonal13              9.083e+02  3.378e+01  26.890  < 2e-16 ***
## seasonal14              8.587e+02  3.385e+01  25.365  < 2e-16 ***
## seasonal15              9.414e+02  3.381e+01  27.843  < 2e-16 ***
## seasonal16              1.276e+03  3.367e+01  37.887  < 2e-16 ***
## seasonal17              2.287e+03  3.349e+01  68.286  < 2e-16 ***
## seasonal18              2.128e+03  3.316e+01  64.183  < 2e-16 ***
## seasonal19              1.189e+03  3.289e+01  36.164  < 2e-16 ***
## seasonal20              6.240e+02  3.267e+01  19.096  < 2e-16 ***
## seasonal21              3.501e+02  3.252e+01  10.766  < 2e-16 ***
## seasonal22              2.388e+02  3.246e+01   7.357 1.98e-13 ***
## seasonal23              1.174e+02  3.246e+01   3.618 0.000298 ***
## t1                      7.531e+01  6.457e+00  11.664  < 2e-16 ***
## t2                     -3.200e+01  5.200e+00  -6.154 7.73e-10 ***
## hum                    -1.308e+01  5.222e-01 -25.055  < 2e-16 ***
## wind_speed             -9.147e+00  6.939e-01 -13.182  < 2e-16 ***
## factor(weather_code)2   3.929e+01  1.330e+01   2.954 0.003146 ** 
## factor(weather_code)3   5.674e-02  1.476e+01   0.004 0.996933    
## factor(weather_code)4  -4.966e+01  2.002e+01  -2.481 0.013123 *  
## factor(weather_code)7  -2.433e+02  1.801e+01 -13.509  < 2e-16 ***
## factor(weather_code)10 -6.386e+02  1.551e+02  -4.118 3.84e-05 ***
## factor(weather_code)26  1.368e+02  8.209e+01   1.666 0.095657 .  
## factor(season)1         1.310e+01  1.672e+01   0.784 0.433211    
## factor(season)2         2.741e+01  1.561e+01   1.756 0.079133 .  
## factor(season)3        -6.822e+01  1.409e+01  -4.841 1.30e-06 ***
## I(is_weekend)          -2.099e+02  1.045e+01 -20.087  < 2e-16 ***
## I(is_holiday)          -2.768e+02  3.353e+01  -8.256  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 577.3 on 15197 degrees of freedom
## Multiple R-squared:  0.7242, Adjusted R-squared:  0.7235 
## F-statistic:  1023 on 39 and 15197 DF,  p-value: < 2.2e-16
train$yhat <- predict(md_trend.2, newdata = train)
test$yhat <- predict(md_trend.2, newdata = test)
#MyData
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
   plot_lm <- function(data, train, test, title = NULL){
    p <- plot_ly(data = data,
            x = ~ time,
            y = ~ cnt,
            type = "scatter",
            mode = "line",
            name = "Actual") %>%
      add_lines(x =  ~ train$time,
                y = ~ train$yhat,
                line = list(color = "red"),
                name = "Fitted") %>%
      add_lines(x =  ~ test$time,
                y = ~ test$yhat,
                line = list(color = "green", dash = "dot", width = 3),
                name = "Forecasted") %>%
      layout(title = title,
             xaxis = list(title = "Year"),
             yaxis = list(title = "Number of Rides booked"),
             legend = list(x = 0.05, y = 0.95))
return(p) }
plot_lm(data = MyData,
            train = train,
            test = test,
            title = "Predicting the Trend (Polynomial) and Seasonal Components of the Series")

#Accuracy of Model: 1.30% Prediction Error on the testing set

mape_md2 <- c(mean(abs(train$cnt - train$yhat) / train$cnt),
                 mean(abs(test$cnt - test$yhat) / test$cnt))
mape_md2
## [1]      Inf 1.202572

#ANOVA

anova(md_trend.2)
## Analysis of Variance Table
## 
## Response: cnt
##                         Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## trend                    1 1.4210e+08  142102653  426.354 < 2.2e-16 ***
## seasonal                23 1.1210e+10  487405330 1462.375 < 2.2e-16 ***
## t1                       1 1.0756e+09 1075581828 3227.097 < 2.2e-16 ***
## t2                       1 2.2678e+07   22678056   68.042 < 2.2e-16 ***
## hum                      1 4.8328e+08  483284498 1450.012 < 2.2e-16 ***
## wind_speed               1 1.0116e+08  101157521  303.506 < 2.2e-16 ***
## factor(weather_code)     6 1.0649e+08   17748200   53.250 < 2.2e-16 ***
## factor(season)           3 1.1904e+07    3967975   11.905 8.772e-08 ***
## I(is_weekend)            1 1.2546e+08  125458600  376.417 < 2.2e-16 ***
## I(is_holiday)            1 2.2719e+07   22719234   68.165 < 2.2e-16 ***
## Residuals            15197 5.0651e+09     333297                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#REsiduals Analysis

checkresiduals(md_trend.2)

## 
##  Breusch-Godfrey test for serial correlation of order up to 43
## 
## data:  Residuals
## LM test = 10716, df = 43, p-value < 2.2e-16

We can see that from the residual lag dependance that our model did not capture all of the variation in the data #we try ofseeting this effect by estimating residuals with ARIMA errors

#goal estimate residuals with ARMA process and add it to the model #make auto arima on residulas

MK I

library('forecast')
MyData.ts<-ts(MyData)
#MyData.ts
cnt_md<-auto.arima(MyData.ts[,2])
#cnt_md$residuals

MyDATA.prot.2 <- data.frame(MyData)
MyDATA.prot.2$residuals <- cnt_md$residuals
#head(MyDATA.prot.2)

#uses ARIMA(2,1,1) to model noise 

h <- length(MyDATA.prot.2$time)/8 # setting a testing partition length
   train <- MyDATA.prot.2[1:(nrow(MyDATA.prot.2) - h), ]
   test <- MyDATA.prot.2[(nrow(MyDATA.prot.2) - h + 1):nrow(MyDATA.prot.2), ]

md_pred.res<-lm(cnt ~ trend + seasonal + t1 + hum + wind_speed + factor(weather_code) + factor(season)+ I(is_weekend) +I(is_holiday)+ residuals , data = train) 


train$yhat <- predict(md_pred.res, newdata = train) #+ train$residuals
test$yhat <- predict(md_pred.res, newdata = test) #+ test$residuals


plot_lm(data =MyDATA.prot.2,
            train = train,
            test = test,
            title = "Predicting the number of rides booked ")
summary(md_pred.res)
## 
## Call:
## lm(formula = cnt ~ trend + seasonal + t1 + hum + wind_speed + 
##     factor(weather_code) + factor(season) + I(is_weekend) + I(is_holiday) + 
##     residuals, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2399.3  -260.9   -19.2   258.4  4032.7 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.227e+03  4.681e+01  26.217  < 2e-16 ***
## trend                   2.384e-03  1.073e-03   2.222 0.026275 *  
## seasonal1              -1.107e+02  3.016e+01  -3.670 0.000243 ***
## seasonal2              -1.688e+02  3.022e+01  -5.585 2.38e-08 ***
## seasonal3              -2.123e+02  3.025e+01  -7.019 2.33e-12 ***
## seasonal4              -2.317e+02  3.028e+01  -7.652 2.09e-14 ***
## seasonal5              -2.214e+02  3.036e+01  -7.294 3.17e-13 ***
## seasonal6              -4.590e+01  3.081e+01  -1.490 0.136229    
## seasonal7               6.145e+02  3.254e+01  18.883  < 2e-16 ***
## seasonal8               1.800e+03  3.398e+01  52.984  < 2e-16 ***
## seasonal9               1.852e+03  3.273e+01  56.571  < 2e-16 ***
## seasonal10              2.966e+02  3.127e+01   9.484  < 2e-16 ***
## seasonal11              4.962e+02  3.108e+01  15.966  < 2e-16 ***
## seasonal12              7.554e+02  3.129e+01  24.143  < 2e-16 ***
## seasonal13              8.361e+02  3.143e+01  26.601  < 2e-16 ***
## seasonal14              7.435e+02  3.156e+01  23.563  < 2e-16 ***
## seasonal15              7.560e+02  3.165e+01  23.882  < 2e-16 ***
## seasonal16              1.018e+03  3.173e+01  32.089  < 2e-16 ***
## seasonal17              1.694e+03  3.338e+01  50.738  < 2e-16 ***
## seasonal18              2.230e+03  3.089e+01  72.188  < 2e-16 ***
## seasonal19              1.401e+03  3.086e+01  45.382  < 2e-16 ***
## seasonal20              6.604e+02  3.038e+01  21.739  < 2e-16 ***
## seasonal21              4.849e+02  3.035e+01  15.975  < 2e-16 ***
## seasonal22              3.134e+02  3.021e+01  10.374  < 2e-16 ***
## seasonal23              1.681e+02  3.019e+01   5.567 2.63e-08 ***
## t1                      3.586e+01  1.368e+00  26.203  < 2e-16 ***
## hum                    -1.352e+01  4.783e-01 -28.276  < 2e-16 ***
## wind_speed             -7.733e+00  6.198e-01 -12.477  < 2e-16 ***
## factor(weather_code)2   2.166e+01  1.237e+01   1.751 0.079912 .  
## factor(weather_code)3  -1.141e+01  1.372e+01  -0.832 0.405601    
## factor(weather_code)4  -5.517e+01  1.861e+01  -2.965 0.003028 ** 
## factor(weather_code)7  -2.132e+02  1.674e+01 -12.735  < 2e-16 ***
## factor(weather_code)10 -5.255e+02  1.441e+02  -3.646 0.000267 ***
## factor(weather_code)26  1.696e+02  7.627e+01   2.223 0.026207 *  
## factor(season)1         3.296e+01  1.542e+01   2.137 0.032596 *  
## factor(season)2         3.069e+01  1.451e+01   2.115 0.034433 *  
## factor(season)3        -6.288e+01  1.303e+01  -4.824 1.42e-06 ***
## I(is_weekend)          -2.060e+02  9.711e+00 -21.215  < 2e-16 ***
## I(is_holiday)          -2.782e+02  3.116e+01  -8.928  < 2e-16 ***
## residuals               5.362e-01  1.087e-02  49.346  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 536.6 on 15197 degrees of freedom
## Multiple R-squared:  0.7617, Adjusted R-squared:  0.7611 
## F-statistic:  1246 on 39 and 15197 DF,  p-value: < 2.2e-16
#plot(md_pred.res)
checkresiduals(md_pred.res)

## 
##  Breusch-Godfrey test for serial correlation of order up to 43
## 
## data:  Residuals
## LM test = 10843, df = 43, p-value < 2.2e-16
mape_md_pred.res <- c(mean(abs(train$cnt - train$yhat) / train$cnt),
                 mean(abs(test$cnt - test$yhat) / test$cnt))
mape_md_pred.res
## [1]      Inf 1.113871
#residuals are modeleled by an ARIMA((2,1,1)) linear process 
#not actual residulas